#include <algorithm>
#include <execution>
#include <iostream>
#include <iterator>
#include <locale>
#include <numeric>
#include <vector>

// to parallelize non-associate accumulative operation, you'd better choose
// transform_reduce instead of reduce; e.g., a + b * b != b + a * a
void print_sum_squared(long const num) {
  std::cout.imbue(std::locale{"en_US.UTF8"});
  std::cout << "num = " << num << '\n';

  // create an immutable vector filled with pattern: 1,2,3,4, 1,2,3,4 ...
  const std::vector<long> v{[n = num * 4] {
    std::vector<long> v;
    v.reserve(n);
    std::generate_n(std::back_inserter(v), n,
                    [i = 0]() mutable { return 1 + i++ % 4; });
    return v;
  }()};

  auto squared_sum = [](auto sum, auto val) { return sum + val * val; };

  auto sum1 = std::accumulate(v.cbegin(), v.cend(), 0L, squared_sum);
  std::cout << "accumulate(): " << sum1 << '\n';

  auto sum2 =
      std::reduce(std::execution::par, v.cbegin(), v.cend(), 0L, squared_sum);
  std::cout << "reduce(): " << sum2 << '\n';

  auto sum3 =
      std::transform_reduce(std::execution::par, v.cbegin(), v.cend(), 0L,
                            std::plus{}, [](auto val) { return val * val; });
  std::cout << "transform_reduce(): " << sum3 << '\n';

  std::cout << '\n';
}

int main() {
  print_sum_squared(1);
  print_sum_squared(1'000);
  print_sum_squared(1'000'000);
}
